data_test <- read.csv("https://raw.githubusercontent.com/salma71/Data_621/master/HW_1/datasets/moneyball-evaluation-data.csv", header = TRUE) %>%select(-INDEX)
data_train <- read.csv("https://raw.githubusercontent.com/salma71/Data_621/master/HW_1/datasets/moneyball-training-data.csv", header = TRUE) %>% select(-INDEX)[1] 2276 16
[1] 259 15
TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
"integer" "integer" "integer" "integer"
TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
"integer" "integer" "integer" "integer"
TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
"integer" "integer" "integer" "integer"
TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
"integer" "integer" "integer" "integer"
| Name | data_train |
| Number of rows | 2276 |
| Number of columns | 16 |
| _______________________ | |
| Column type frequency: | |
| numeric | 16 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| TARGET_WINS | 0 | 1.00 | 80.79 | 15.75 | 0 | 71.0 | 82.0 | 92.00 | 146 | ▁▁▇▅▁ |
| TEAM_BATTING_H | 0 | 1.00 | 1469.27 | 144.59 | 891 | 1383.0 | 1454.0 | 1537.25 | 2554 | ▁▇▂▁▁ |
| TEAM_BATTING_2B | 0 | 1.00 | 241.25 | 46.80 | 69 | 208.0 | 238.0 | 273.00 | 458 | ▁▆▇▂▁ |
| TEAM_BATTING_3B | 0 | 1.00 | 55.25 | 27.94 | 0 | 34.0 | 47.0 | 72.00 | 223 | ▇▇▂▁▁ |
| TEAM_BATTING_HR | 0 | 1.00 | 99.61 | 60.55 | 0 | 42.0 | 102.0 | 147.00 | 264 | ▇▆▇▅▁ |
| TEAM_BATTING_BB | 0 | 1.00 | 501.56 | 122.67 | 0 | 451.0 | 512.0 | 580.00 | 878 | ▁▁▇▇▁ |
| TEAM_BATTING_SO | 102 | 0.96 | 735.61 | 248.53 | 0 | 548.0 | 750.0 | 930.00 | 1399 | ▁▆▇▇▁ |
| TEAM_BASERUN_SB | 131 | 0.94 | 124.76 | 87.79 | 0 | 66.0 | 101.0 | 156.00 | 697 | ▇▃▁▁▁ |
| TEAM_BASERUN_CS | 772 | 0.66 | 52.80 | 22.96 | 0 | 38.0 | 49.0 | 62.00 | 201 | ▃▇▁▁▁ |
| TEAM_BATTING_HBP | 2085 | 0.08 | 59.36 | 12.97 | 29 | 50.5 | 58.0 | 67.00 | 95 | ▂▇▇▅▁ |
| TEAM_PITCHING_H | 0 | 1.00 | 1779.21 | 1406.84 | 1137 | 1419.0 | 1518.0 | 1682.50 | 30132 | ▇▁▁▁▁ |
| TEAM_PITCHING_HR | 0 | 1.00 | 105.70 | 61.30 | 0 | 50.0 | 107.0 | 150.00 | 343 | ▇▇▆▁▁ |
| TEAM_PITCHING_BB | 0 | 1.00 | 553.01 | 166.36 | 0 | 476.0 | 536.5 | 611.00 | 3645 | ▇▁▁▁▁ |
| TEAM_PITCHING_SO | 102 | 0.96 | 817.73 | 553.09 | 0 | 615.0 | 813.5 | 968.00 | 19278 | ▇▁▁▁▁ |
| TEAM_FIELDING_E | 0 | 1.00 | 246.48 | 227.77 | 65 | 127.0 | 159.0 | 249.25 | 1898 | ▇▁▁▁▁ |
| TEAM_FIELDING_DP | 286 | 0.87 | 146.39 | 26.23 | 52 | 131.0 | 149.0 | 164.00 | 228 | ▁▂▇▆▁ |
| Name | data_test |
| Number of rows | 259 |
| Number of columns | 15 |
| _______________________ | |
| Column type frequency: | |
| numeric | 15 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| TEAM_BATTING_H | 0 | 1.00 | 1469.39 | 150.66 | 819 | 1387.0 | 1455.0 | 1548.00 | 2170 | ▁▂▇▁▁ |
| TEAM_BATTING_2B | 0 | 1.00 | 241.32 | 49.52 | 44 | 210.0 | 239.0 | 278.50 | 376 | ▁▂▇▇▂ |
| TEAM_BATTING_3B | 0 | 1.00 | 55.91 | 27.14 | 14 | 35.0 | 52.0 | 72.00 | 155 | ▇▇▃▁▁ |
| TEAM_BATTING_HR | 0 | 1.00 | 95.63 | 56.33 | 0 | 44.5 | 101.0 | 135.50 | 242 | ▆▅▇▃▁ |
| TEAM_BATTING_BB | 0 | 1.00 | 498.96 | 120.59 | 15 | 436.5 | 509.0 | 565.50 | 792 | ▁▁▅▇▁ |
| TEAM_BATTING_SO | 18 | 0.93 | 709.34 | 243.11 | 0 | 545.0 | 686.0 | 912.00 | 1268 | ▁▃▇▇▂ |
| TEAM_BASERUN_SB | 13 | 0.95 | 123.70 | 93.39 | 0 | 59.0 | 92.0 | 151.75 | 580 | ▇▃▁▁▁ |
| TEAM_BASERUN_CS | 87 | 0.66 | 52.32 | 23.10 | 0 | 38.0 | 49.5 | 63.00 | 154 | ▂▇▃▁▁ |
| TEAM_BATTING_HBP | 240 | 0.07 | 62.37 | 12.71 | 42 | 53.5 | 62.0 | 67.50 | 96 | ▃▇▅▁▁ |
| TEAM_PITCHING_H | 0 | 1.00 | 1813.46 | 1662.91 | 1155 | 1426.5 | 1515.0 | 1681.00 | 22768 | ▇▁▁▁▁ |
| TEAM_PITCHING_HR | 0 | 1.00 | 102.15 | 57.65 | 0 | 52.0 | 104.0 | 142.50 | 336 | ▇▇▆▁▁ |
| TEAM_PITCHING_BB | 0 | 1.00 | 552.42 | 172.95 | 136 | 471.0 | 526.0 | 606.50 | 2008 | ▆▇▁▁▁ |
| TEAM_PITCHING_SO | 18 | 0.93 | 799.67 | 634.31 | 0 | 613.0 | 745.0 | 938.00 | 9963 | ▇▁▁▁▁ |
| TEAM_FIELDING_E | 0 | 1.00 | 249.75 | 230.90 | 73 | 131.0 | 163.0 | 252.00 | 1568 | ▇▁▁▁▁ |
| TEAM_FIELDING_DP | 31 | 0.88 | 146.06 | 25.88 | 69 | 131.0 | 148.0 | 164.00 | 204 | ▁▂▇▇▂ |
data_train %>%
gather(variable, value) %>%
filter(is.na(value)) %>%
group_by(variable) %>%
tally() %>%
mutate(percent = n / nrow(data_train) * 100) %>%
mutate(percent = paste0(round(percent, ifelse(percent < 10, 1, 0)), "%")) %>%
arrange(desc(n)) %>%
rename(`Variable Missing Data` = variable,
`Number of Records` = n,
`Share of Total` = percent) %>%
kable() %>%
kable_styling()| Variable Missing Data | Number of Records | Share of Total |
|---|---|---|
| TEAM_BATTING_HBP | 2085 | 92% |
| TEAM_BASERUN_CS | 772 | 34% |
| TEAM_FIELDING_DP | 286 | 13% |
| TEAM_BASERUN_SB | 131 | 5.8% |
| TEAM_BATTING_SO | 102 | 4.5% |
| TEAM_PITCHING_SO | 102 | 4.5% |
data_test %>%
gather(variable, value) %>%
filter(is.na(value)) %>%
group_by(variable) %>%
tally() %>%
mutate(percent = n / nrow(data_train) * 100) %>%
mutate(percent = paste0(round(percent, ifelse(percent < 10, 1, 0)), "%")) %>%
arrange(desc(n)) %>%
rename(`Variable Missing Data` = variable,
`Number of Records` = n,
`Share of Total` = percent) %>%
kable() %>%
kable_styling()| Variable Missing Data | Number of Records | Share of Total |
|---|---|---|
| TEAM_BATTING_HBP | 240 | 11% |
| TEAM_BASERUN_CS | 87 | 3.8% |
| TEAM_FIELDING_DP | 31 | 1.4% |
| TEAM_BATTING_SO | 18 | 0.8% |
| TEAM_PITCHING_SO | 18 | 0.8% |
| TEAM_BASERUN_SB | 13 | 0.6% |
data_train %>%
gather() %>%
ggplot( aes(x= value)) +
geom_density(fill='pink') +
facet_wrap(~key, scales = 'free')Let’s take a closer look at the TEAM_BASERUN_SB
data_train %>%
ggplot(aes(TEAM_BASERUN_SB)) +
geom_histogram(bins = 50, fill = 'pink') +
geom_vline(aes(xintercept = mean(TEAM_BASERUN_SB, na.rm = T)), col = "red", lty = 2) +
geom_vline(aes(xintercept = median(TEAM_BASERUN_SB, na.rm = T)), col = "blue", lty = 2) +
labs(x = element_blank(),
y = "Count",
title = "Distribution of Stolen Bases",
caption = "* Red line is the mean value and blue is the median") +
theme_classic()ggplot(data_train, aes(x=TARGET_WINS)) +
geom_histogram(aes(y=..density..),binwidth = 3, colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic()Correlations with Response Variable Let’s take a look at how the predictors are correlated with the response variable:
q <- cor(data_train)
ggcorrplot(q, type = "upper", outline.color = "white",
ggtheme = theme_classic,
colors = c("#6D9EC1", "white", "#E46726"),
lab = TRUE, show.legend = FALSE, tl.cex = 8, lab_size = 3) data_train %>%
gather(variable, value, -TARGET_WINS) %>%
ggplot(., aes(value, TARGET_WINS)) +
geom_point(fill = "pink", color="pink") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
facet_wrap(~variable, scales ="free", ncol = 4) +
labs(x = element_blank(), y = "Wins")[1] 0.09550747
data_train <- data_train %>%
mutate(TEAM_BATTING_SO = ifelse(TEAM_BATTING_SO == 0, NA, TEAM_BATTING_SO)) %>%
mutate(TEAM_PITCHING_SO = ifelse(TEAM_PITCHING_SO > 5346, NA, TEAM_PITCHING_SO)) %>%
select(-TEAM_BATTING_HBP)
set.seed(42)
knn <- data_train %>% knnImputation()
impute_me <- is.na(data_train$TEAM_BATTING_SO)
data_train[impute_me,"TEAM_BATTING_SO"] <- knn[impute_me,"TEAM_BATTING_SO"]
impute_me <- is.na(data_train$TEAM_BASERUN_SB)
data_train[impute_me,"TEAM_BASERUN_SB"] <- knn[impute_me,"TEAM_BASERUN_SB"]
impute_me <- is.na(data_train$TEAM_BASERUN_CS)
data_train[impute_me,"TEAM_BASERUN_CS"] <- knn[impute_me,"TEAM_BASERUN_CS"]
impute_me <- is.na(data_train$TEAM_PITCHING_SO)
data_train[impute_me,"TEAM_PITCHING_SO"] <- knn[impute_me,"TEAM_PITCHING_SO"]
impute_me <- is.na(data_train$TEAM_FIELDING_DP)
data_train[impute_me,"TEAM_FIELDING_DP"] <- knn[impute_me,"TEAM_FIELDING_DP"][1] 0
Do the same for the data_test
[1] 0.1047619
data_test <- data_test %>%
mutate(TEAM_BATTING_SO = ifelse(TEAM_BATTING_SO == 0, NA, TEAM_BATTING_SO)) %>%
mutate(TEAM_PITCHING_SO = ifelse(TEAM_PITCHING_SO > 5346, NA, TEAM_PITCHING_SO)) %>%
select(-TEAM_BATTING_HBP)
set.seed(42)
knn <- data_test %>% knnImputation()
impute_me <- is.na(data_test$TEAM_BATTING_SO)
data_test[impute_me,"TEAM_BATTING_SO"] <- knn[impute_me,"TEAM_BATTING_SO"]
impute_me <- is.na(data_test$TEAM_BASERUN_SB)
data_test[impute_me,"TEAM_BASERUN_SB"] <- knn[impute_me,"TEAM_BASERUN_SB"]
impute_me <- is.na(data_test$TEAM_BASERUN_CS)
data_test[impute_me,"TEAM_BASERUN_CS"] <- knn[impute_me,"TEAM_BASERUN_CS"]
impute_me <- is.na(data_test$TEAM_PITCHING_SO)
data_test[impute_me,"TEAM_PITCHING_SO"] <- knn[impute_me,"TEAM_PITCHING_SO"]
impute_me <- is.na(data_test$TEAM_FIELDING_DP)
data_test[impute_me,"TEAM_FIELDING_DP"] <- knn[impute_me,"TEAM_FIELDING_DP"][1] 0
Call:
lm(formula = TARGET_WINS ~ ., data = data_train, na.action = na.omit)
Residuals:
Min 1Q Median 3Q Max
-49.112 -8.463 0.047 8.364 61.603
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.9438573 5.8754934 1.182 0.237396
TEAM_BATTING_H 0.0521644 0.0037624 13.865 < 2e-16 ***
TEAM_BATTING_2B -0.0146400 0.0091385 -1.602 0.109290
TEAM_BATTING_3B 0.0383492 0.0172017 2.229 0.025886 *
TEAM_BATTING_HR 0.0768270 0.0281262 2.732 0.006354 **
TEAM_BATTING_BB 0.0013952 0.0049562 0.281 0.778354
TEAM_BATTING_SO -0.0019862 0.0033961 -0.585 0.558720
TEAM_BASERUN_SB 0.0040365 0.0054119 0.746 0.455838
TEAM_BASERUN_CS 0.0973314 0.0155751 6.249 4.92e-10 ***
TEAM_PITCHING_H -0.0004574 0.0003778 -1.211 0.226086
TEAM_PITCHING_HR 0.0030711 0.0247709 0.124 0.901341
TEAM_PITCHING_BB 0.0108895 0.0032308 3.371 0.000763 ***
TEAM_PITCHING_SO -0.0023276 0.0022387 -1.040 0.298585
TEAM_FIELDING_E -0.0216762 0.0024116 -8.988 < 2e-16 ***
TEAM_FIELDING_DP -0.0961580 0.0139737 -6.881 7.65e-12 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 12.95 on 2261 degrees of freedom
Multiple R-squared: 0.3284, Adjusted R-squared: 0.3242
F-statistic: 78.96 on 14 and 2261 DF, p-value: < 2.2e-16
m2 <- lm(TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_HR +TEAM_BATTING_3B +TEAM_BASERUN_CS+TEAM_FIELDING_E+ TEAM_FIELDING_DP, data = data_train)
summary(m2)
Call:
lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_HR +
TEAM_BATTING_3B + TEAM_BASERUN_CS + TEAM_FIELDING_E + TEAM_FIELDING_DP,
data = data_train)
Residuals:
Min 1Q Median 3Q Max
-46.717 -8.663 -0.059 8.538 65.935
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.473998 3.915403 1.143 0.253298
TEAM_BATTING_H 0.051169 0.002467 20.741 < 2e-16 ***
TEAM_BATTING_HR 0.073968 0.007638 9.684 < 2e-16 ***
TEAM_BATTING_3B 0.061358 0.016375 3.747 0.000183 ***
TEAM_BASERUN_CS 0.107941 0.012081 8.935 < 2e-16 ***
TEAM_FIELDING_E -0.023632 0.001584 -14.924 < 2e-16 ***
TEAM_FIELDING_DP -0.078465 0.013791 -5.690 1.44e-08 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 13.06 on 2269 degrees of freedom
Multiple R-squared: 0.3144, Adjusted R-squared: 0.3126
F-statistic: 173.4 on 6 and 2269 DF, p-value: < 2.2e-16
full_formula <- "TARGET_WINS ~ TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP + I(TEAM_BATTING_2B^2) + I(TEAM_BATTING_3B^2) + I(TEAM_BATTING_HR^2) + I(TEAM_BATTING_BB^2) + I(TEAM_BATTING_SO^2) + I(TEAM_BASERUN_SB^2) + I(TEAM_BASERUN_CS^2) + I(TEAM_PITCHING_H^2) + I(TEAM_PITCHING_HR^2) + I(TEAM_PITCHING_BB^2) + I(TEAM_PITCHING_SO^2) + I(TEAM_FIELDING_E^2) + I(TEAM_FIELDING_DP^2) + I(TEAM_BATTING_2B^3) + I(TEAM_BATTING_3B^3) + I(TEAM_BATTING_HR^3) + I(TEAM_BATTING_BB^3) + I(TEAM_BATTING_SO^3) + I(TEAM_BASERUN_SB^3) + I(TEAM_BASERUN_CS^3) + I(TEAM_PITCHING_H^3) + I(TEAM_PITCHING_HR^3) + I(TEAM_PITCHING_BB^3) + I(TEAM_PITCHING_SO^3) + I(TEAM_FIELDING_E^3) + I(TEAM_FIELDING_DP^3) + I(TEAM_BATTING_2B^4) + I(TEAM_BATTING_3B^4) + I(TEAM_BATTING_HR^4) + I(TEAM_BATTING_BB^4) + I(TEAM_BATTING_SO^4) + I(TEAM_BASERUN_SB^4) + I(TEAM_BASERUN_CS^4) + I(TEAM_PITCHING_H^4) + I(TEAM_PITCHING_HR^4) + I(TEAM_PITCHING_BB^4) + I(TEAM_PITCHING_SO^4) + I(TEAM_FIELDING_E^4) + I(TEAM_FIELDING_DP^4) "
m3 <- lm(full_formula, data_train)
step_back <- MASS::stepAIC(m3, direction="backward", trace = F)
poly_call <- summary(step_back)$call
step_back <- lm(poly_call[2], data_train)
summary(step_back)
Call:
lm(formula = poly_call[2], data = data_train)
Residuals:
Min 1Q Median 3Q Max
-54.736 -7.452 0.017 7.344 61.619
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.055e+01 5.511e+01 -0.191 0.848198
TEAM_BATTING_2B 1.519e+00 1.809e-01 8.396 < 2e-16 ***
TEAM_BATTING_3B 3.764e-01 1.871e-01 2.012 0.044312 *
TEAM_BATTING_BB 7.492e-01 9.617e-02 7.790 1.02e-14 ***
TEAM_BASERUN_SB 4.464e-02 1.191e-02 3.748 0.000182 ***
TEAM_PITCHING_H 4.189e-02 3.584e-03 11.689 < 2e-16 ***
TEAM_PITCHING_BB -6.975e-02 1.195e-02 -5.836 6.14e-09 ***
TEAM_PITCHING_SO -2.545e-02 4.404e-03 -5.779 8.57e-09 ***
TEAM_FIELDING_E -2.289e-01 2.252e-02 -10.166 < 2e-16 ***
TEAM_FIELDING_DP -3.639e+00 1.629e+00 -2.233 0.025639 *
I(TEAM_BATTING_2B^2) -5.951e-03 7.210e-04 -8.254 2.58e-16 ***
I(TEAM_BATTING_3B^2) -5.323e-03 3.696e-03 -1.440 0.149948
I(TEAM_BATTING_HR^2) -2.369e-03 9.823e-04 -2.412 0.015967 *
I(TEAM_BATTING_BB^2) -2.499e-03 3.194e-04 -7.825 7.78e-15 ***
I(TEAM_BATTING_SO^2) 3.270e-05 8.862e-06 3.690 0.000229 ***
I(TEAM_BASERUN_CS^2) -1.157e-03 8.030e-04 -1.441 0.149837
I(TEAM_PITCHING_H^2) -4.327e-06 4.723e-07 -9.161 < 2e-16 ***
I(TEAM_PITCHING_HR^2) 3.015e-03 7.715e-04 3.909 9.56e-05 ***
I(TEAM_PITCHING_SO^2) 2.245e-06 1.159e-06 1.937 0.052925 .
I(TEAM_FIELDING_E^2) 3.929e-04 5.385e-05 7.296 4.10e-13 ***
I(TEAM_FIELDING_DP^2) 3.887e-02 1.806e-02 2.153 0.031455 *
I(TEAM_BATTING_2B^3) 7.573e-06 9.411e-07 8.047 1.36e-15 ***
I(TEAM_BATTING_3B^3) 4.442e-05 2.803e-05 1.585 0.113173
I(TEAM_BATTING_HR^3) 1.575e-05 7.177e-06 2.194 0.028302 *
I(TEAM_BATTING_BB^3) 3.690e-06 4.879e-07 7.564 5.68e-14 ***
I(TEAM_BATTING_SO^3) -2.345e-08 5.875e-09 -3.992 6.76e-05 ***
I(TEAM_BASERUN_SB^3) -2.227e-07 1.443e-07 -1.543 0.122933
I(TEAM_BASERUN_CS^3) 2.134e-05 9.901e-06 2.156 0.031216 *
I(TEAM_PITCHING_H^3) 1.688e-10 2.696e-11 6.262 4.54e-10 ***
I(TEAM_PITCHING_HR^3) -1.683e-05 4.989e-06 -3.374 0.000753 ***
I(TEAM_PITCHING_BB^3) 1.061e-08 2.997e-09 3.540 0.000408 ***
I(TEAM_FIELDING_E^3) -3.019e-07 4.806e-08 -6.281 4.02e-10 ***
I(TEAM_FIELDING_DP^3) -1.854e-04 8.639e-05 -2.146 0.031983 *
I(TEAM_BATTING_3B^4) -1.265e-07 6.979e-08 -1.813 0.069995 .
I(TEAM_BATTING_HR^4) -2.933e-08 1.495e-08 -1.962 0.049907 *
I(TEAM_BATTING_BB^4) -1.874e-09 2.652e-10 -7.064 2.16e-12 ***
I(TEAM_BASERUN_SB^4) 3.276e-10 2.064e-10 1.587 0.112665
I(TEAM_BASERUN_CS^4) -6.719e-08 3.306e-08 -2.032 0.042243 *
I(TEAM_PITCHING_H^4) -2.130e-15 4.961e-16 -4.295 1.82e-05 ***
I(TEAM_PITCHING_HR^4) 2.649e-08 8.868e-09 2.988 0.002842 **
I(TEAM_PITCHING_BB^4) -1.522e-12 6.952e-13 -2.189 0.028685 *
I(TEAM_FIELDING_E^4) 7.607e-11 1.416e-11 5.371 8.66e-08 ***
I(TEAM_FIELDING_DP^4) 3.253e-07 1.511e-07 2.153 0.031439 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 11.95 on 2233 degrees of freedom
Multiple R-squared: 0.4351, Adjusted R-squared: 0.4245
F-statistic: 40.95 on 42 and 2233 DF, p-value: < 2.2e-16
In order to select which model is the “best” we will test it against a test set. We will examine the difference between the predicted and actual values. Since the wins are in whole numbers and the predict function will generate floating point numbers, I will be rounding the results.